Ce notebook se focalise sur l'étude du lissage commencée par Abdel lors de la première mission
Ce notebook est construit de plusieurs parties :
Pour que le notebook fonctionne, il est nécessaire que le fichier project_config.yml soit situé au même niveau que ce notebook : ce fichier permet la connexion à la source de données.
Le notebook fonctionne sur Python 3.7 avec les bibliothèques suivantes :
Initialement, la résolution par la méthode Lasso consiste à résoudre un système d'équations qui correspond à un instant t donné.
Le principe du lissage est de prendre en compte les échantillons du passé. Ainsi, un lissage prenant en compte n échantillons correspond à un système d'équations pour les échantillons à t, t-1, ..., t-n+1
Le notebook se focalise seulement sur les données du SIRTA et sur une courte période
# handling postgres database
import psycopg2
import pandas.io.sql as sqlio
import pandas as pd
from sqlalchemy import create_engine
from io import StringIO
from sklearn.linear_model import Lasso
from sklearn import metrics
import numpy as np
from datetime import datetime, timedelta
import matplotlib.pyplot as plt
import yaml
from IPython.display import clear_output
# Database credentials are read from project_config.yml, which must sit next
# to this notebook.
# - safe_load is sufficient for a plain mapping and, unlike yaml.load, never
#   instantiates arbitrary Python objects from the file.
# - the plain relative path replaces the Windows-only backslash literal
#   (r'.\project_config.yml'), so the notebook also runs on Linux/macOS.
with open('project_config.yml') as file:
    dbInfo = yaml.safe_load(file)
HOSTNAME = dbInfo["project-database"]["hostname"]
DATABASE = dbInfo["project-database"]["name"]
USER = dbInfo["project-database"]["user"]
PASSWORD = dbInfo["project-database"]["password"]
PORT = "5432"  # default PostgreSQL port
# Load the four source profiles (BBOA, HOA, LO-OOA, MO-OOA) from the database.
query = "SELECT * FROM public.profiles"
connection = psycopg2.connect(user=USER, password=PASSWORD, host=HOSTNAME, port=PORT, database=DATABASE)
try:
    df = sqlio.read_sql_query(query, connection)
finally:
    # Close the connection even when the query raises (the original leaked
    # the connection on error).
    connection.close()
--------------------------------------------------------------------------- OperationalError Traceback (most recent call last) <ipython-input-4-d48a41907233> in <module> 1 query = "SELECT * FROM public.profiles" ----> 2 connection = psycopg2.connect(user = USER, password = PASSWORD, host = HOSTNAME, port = PORT, database = DATABASE) 3 df = sqlio.read_sql_query(query, connection) 4 connection.close() ~\Anaconda3\envs\ineris\lib\site-packages\psycopg2\__init__.py in connect(dsn, connection_factory, cursor_factory, **kwargs) 125 126 dsn = _ext.make_dsn(dsn, **kwargs) --> 127 conn = _connect(dsn, connection_factory=connection_factory, **kwasync) 128 if cursor_factory is not None: 129 conn.cursor_factory = cursor_factory OperationalError: could not translate host name "13.postgresql.dev.asterix.heka.ai" to address: Unknown host
# Distinct chemical profiles present in the table, in order of appearance.
pro = df['profile'].unique()
pro
array(['BBOA', 'HOA', 'LO-OOA', 'MO-OOA'], dtype=object)
# Reshape to one row per mass with one column per chemical profile, and
# rename the mass column to `amus` (atomic mass units).
df = (
    df.pivot(index="mass", columns='profile', values='value')
      .reset_index()
      .rename(columns={"mass": "amus"})
)
df
| profile | amus | BBOA | HOA | LO-OOA | MO-OOA |
|---|---|---|---|---|---|
| 0 | 13.0 | 0.00189 | 0.00129 | 0.00430 | 0.00387 |
| 1 | 15.0 | 0.05999 | 0.00033 | 0.03545 | 0.00616 |
| 2 | 16.0 | 0.00621 | 0.00074 | 0.00787 | 0.01103 |
| 3 | 17.0 | 0.03883 | 0.00461 | 0.04919 | 0.06893 |
| 4 | 18.0 | 0.15533 | 0.01844 | 0.19675 | 0.27572 |
| ... | ... | ... | ... | ... | ... |
| 67 | 96.0 | 0.00359 | 0.00499 | 0.00214 | 0.00233 |
| 68 | 97.0 | 0.00309 | 0.01834 | 0.00203 | 0.00185 |
| 69 | 98.0 | 0.00114 | 0.00634 | 0.00133 | 0.00122 |
| 70 | 99.0 | 0.00344 | 0.00393 | 0.00234 | 0.00186 |
| 71 | 100.0 | 0.00177 | 0.00084 | 0.00090 | 0.00072 |
72 rows × 5 columns
## SELECTION DE LA DATE
# Analysis window (inclusive bounds, used by every SQL query below).
start_date = '2015-03-01'
end_date = '2015-03-15'
## FIN DE LA SELECTION DE LA DATE
# Every distinct hour with receptor data in the window, ascending, formatted
# as 'YYYY-MM-DD HH:00:00' strings.
sql = f"""SELECT to_char(date, 'YYYY-MM-DD HH24:00:00') FROM(SELECT date FROM public.data_receptor
WHERE date>='{start_date}' AND date<='{end_date}' GROUP BY 1 ORDER BY 1) AS foo;"""
connection = psycopg2.connect(user=USER, password=PASSWORD, host=HOSTNAME, port=PORT, database=DATABASE)
try:
    df_dates = sqlio.read_sql_query(sql, connection)
finally:
    # Close even on query failure (the original leaked the connection then).
    connection.close()
# 1-D array of date strings, chronological order.
dates = df_dates.T.values[0]
# Reference PMF errors (MAE and MSE), one value per date of the period.
# Cleanups relative to the original:
# - the date filter belongs in WHERE, not HAVING (HAVING filtered after an
#   already-per-date GROUP BY), and the `LIMIT 10` was dead weight since each
#   query returns exactly one row;
# - the two near-identical query strings are generated from one template;
# - the connection is closed even if a query raises.
connection = psycopg2.connect(user=USER, password=PASSWORD, host=HOSTNAME, port=PORT, database=DATABASE)
error_PMF_MAE = []
error_PMF_MSE = []
try:
    for date in dates:
        for error_type, target in (('MAE', error_PMF_MAE), ('MSE', error_PMF_MSE)):
            query_error = f"""
            SELECT date, AVG(error)
            FROM public.model_error
            WHERE model='PMF' AND error_type='{error_type}' AND date = '{date}'
            GROUP BY date
            """
            values = sqlio.read_sql_query(query_error, connection).values
            target.append(values[0][1])
finally:
    connection.close()
# Raw receptor measurements over the window; reshaped to a (date x mass)
# matrix of values.
connection = psycopg2.connect(user=USER, password=PASSWORD, host=HOSTNAME, port=PORT, database=DATABASE)
try:
    query = f"SELECT * FROM public.data_receptor WHERE date > '{start_date}' AND date <= '{end_date}'"
    df_receptor = sqlio.read_sql_query(query, connection)
finally:
    connection.close()
df_pivot = df_receptor.pivot_table(values="value", columns="mass", index="date")
# sort_index() sorts on the date index directly (equivalent to the previous
# sort_values("date") on the index level, but unambiguous).
df_pivot = df_pivot.sort_index()
# One float row per date, without the date column. The original went through
# reset_index().values and stripped column 0 from each row, which produced
# object-dtype rows; taking the numeric values directly avoids that.
df_array = list(df_pivot.to_numpy())
# PMF regressor contributions over the same window, used as the reference in
# the comparison plots below.
connection = psycopg2.connect(user=USER, password=PASSWORD, host=HOSTNAME, port=PORT, database=DATABASE)
try:
    query = f"SELECT * FROM public.regressor_results WHERE date > '{start_date}' AND date <= '{end_date}' AND model='PMF'"
    df_regressor = sqlio.read_sql_query(query, connection)
finally:
    # Close even on failure (the original leaked the connection on error).
    connection.close()
# Raw smoothing: for each window size `pas`, the smoothed sample attached to
# a date is the plain average of the `pas` consecutive receptor samples
# ending at that date.
list_pas = [1, 3, 5, 7, 9]
lissage_brute = []
for pas in list_pas:
    result = [np.mean(df_array[k:k + pas], axis=0) for k, _ in enumerate(dates[pas:])]
    lissage_brute.append(result)
# --- Lasso fit on raw (unweighted) stacked samples ---------------------------
# For each window size `pas`, one Lasso is fitted per date on the equations of
# the current sample plus the `pas - 1` previous hourly samples; contributions
# and four error metrics are recorded.
result = []                      # per pas: per-profile contribution series
result_construct = []            # per pas: reconstructed signal per date
result_error_MAE = []            # errors vs the raw reference signal
result_error_MSE = []
result_error_MAE_modified = []   # errors vs the smoothed (averaged) signal
result_error_MSE_modified = []
connection = psycopg2.connect(user=USER, password=PASSWORD, host=HOSTNAME,
                              port=PORT, database=DATABASE)
try:
    for index_pas, pas in enumerate(list_pas):
        contribution = [[] for _ in pro]   # fixed typo: was `contirubtion`
        contribution_mae = []
        contribution_mse = []
        contribution_mae_modified = []
        contribution_mse_modified = []
        result_construct.append([])
        c = []  # per-date absolute deviations (kept; not read downstream here)
        for index_date, date in enumerate(dates[pas:]):
            # Receptor data for the current date and the pas - 1 preceding
            # hours. BUGFIX: the intermediate dates were previously taken as
            # df_dates.iloc[k] — the k-th date of the WHOLE period, unrelated
            # to the current date. They are now computed relative to `date`,
            # consistently with the weighted versions further below.
            sql = f"""SELECT * FROM public.data_receptor where date = '{date}' """
            for k in range(1, pas):
                date_intermediaire = (datetime.strptime(date, '%Y-%m-%d %H:%M:%S')
                                      - timedelta(hours=k)).strftime('%Y-%m-%d %H:%M:%S')
                sql += f"or date = '{date_intermediaire}' "
            sql += "order by mass;"
            df_receptor_data = sqlio.read_sql_query(sql, connection)
            # (a no-op self-reindexing line was removed here)
            df_receptor_data_ref = df_receptor_data[df_receptor_data["date"] == date]["value"].values
            cor = df.merge(df_receptor_data, left_on='amus', right_on='mass').drop(columns=['mass', 'amus'])
            # Design matrix (profile spectra) and target (measured values).
            X_train = cor[pro].values
            y_train = cor['value'].values.reshape(-1, 1)
            # Lasso without intercept; contributions constrained positive.
            alpha = 0.001
            lasso = Lasso(fit_intercept=False, alpha=alpha, positive=True)
            lasso.fit(X_train, y_train)
            clear_output(wait=True)
            print(pas, ' ', date)
            for n, prof in enumerate(pro):
                contribution[n].append(lasso.coef_[n])
            # Signal reconstructed from the fitted contributions.
            data_profile = df[pro].values
            construct = np.dot(data_profile, lasso.coef_)
            result_construct[index_pas].append(construct)
            # MAE / MSE against the raw reference signal.
            error_mae = np.sum(np.abs(construct - df_receptor_data_ref)) / len(df_receptor_data_ref)
            contribution_mae.append(error_mae)
            error_mse = np.sum(np.abs(construct**2 - df_receptor_data_ref**2)) / len(df_receptor_data_ref)
            contribution_mse.append(error_mse)
            # MAE / MSE against the smoothed signal.
            error_mae = np.sum(np.abs(construct - lissage_brute[index_pas][index_date])) / len(construct)
            contribution_mae_modified.append(error_mae)
            c.append(np.abs(construct - lissage_brute[index_pas][index_date]))
            error_mse = np.sum(np.abs(construct**2 - lissage_brute[index_pas][index_date]**2)) / len(construct)
            contribution_mse_modified.append(error_mse)
        result.append(contribution)
        result_error_MSE.append(contribution_mse)
        result_error_MAE.append(contribution_mae)
        result_error_MSE_modified.append(contribution_mse_modified)
        result_error_MAE_modified.append(contribution_mae_modified)
finally:
    connection.close()
print("Done")
9 2015-03-15 00:00:00 Done
# Sanity check: L1 distance between the first reconstructed signal (pas = 1)
# and a smoothed sample. NOTE(review): compares index 0 against index 1 —
# presumably a deliberate offset probe; confirm the intent.
np.sum(np.abs(result_construct[0][0] - lissage_brute[0][1]))
0.7606828130000003
# Number of fitted dates for the first window size (pas = 1).
len(result_error_MSE[0])
336
# Contribution of each profile over time for every smoothing window, with the
# PMF reference contribution overlaid.
plt.rcParams["figure.figsize"] = (20, 10)
for n, prof in enumerate(pro):
    fig, ax = plt.subplots()
    fig.suptitle(prof)
    for idx, pas in enumerate(list_pas):
        x_axis = [datetime.strptime(d, '%Y-%m-%d %H:%M:%S') for d in dates[pas:]]
        ax.plot(x_axis, result[idx][n], label=f'{pas} sample(s)')
    pmf = df_regressor[df_regressor["profile"] == prof].sort_values(by="date")["contribution"].values
    x_pmf = [datetime.strptime(d, '%Y-%m-%d %H:%M:%S') for d in dates[1:]]
    ax.plot(x_pmf, pmf, label='PMF')
    plt.xlabel("Date")
    plt.ylabel("Contribution")
    plt.grid()
    plt.legend()
    plt.show()
# Absolute gap between each Lasso contribution series and the PMF reference,
# per profile and window size.
plt.rcParams["figure.figsize"] = (20, 10)
for n, prof in enumerate(pro):
    fig, ax = plt.subplots()
    fig.suptitle("PMF vs Lasso: " + prof)
    pmf = df_regressor[df_regressor["profile"] == prof].sort_values(by="date")["contribution"].values
    for idx, pas in enumerate(list_pas):
        x_axis = [datetime.strptime(d, '%Y-%m-%d %H:%M:%S') for d in dates[pas:]]
        ax.plot(x_axis, np.abs(result[idx][n] - pmf[pas - 1:]), label=f'{pas} sample(s)')
    plt.grid()
    plt.legend()
    plt.show()
# Per-profile grid of contribution curves, one subplot per window size
# (3 subplots per row). `squeeze=False` keeps `axes` 2-D whatever the grid
# shape, removing the single-row special case and fixing the crash that the
# old `ax[col]` indexing had when list_pas contains a single element.
plt.rcParams["figure.figsize"] = (20, 5)
for n, prof in enumerate(pro):
    n_rows = int(np.ceil(len(list_pas) / 3))
    n_cols = min(3, len(list_pas))
    fig, axes = plt.subplots(n_rows, n_cols, squeeze=False)
    fig.suptitle(prof)
    for i, pas in enumerate(list_pas):
        ax = axes[i // 3][i % 3]
        ax.plot(result[i][n], label=f'{pas} sample(s)')
        ax.grid()
        ax.legend()
    plt.show()
def _plot_error_raw(title, series_by_pas, label_fmt):
    """One figure for the raw-fit errors: one curve per window size of
    `series_by_pas`, plotted against that window's date axis."""
    plt.rcParams["figure.figsize"] = (20, 10)
    fig, ax = plt.subplots()
    fig.suptitle(title)
    for idx, pas in enumerate(list_pas):
        x_axis = [datetime.strptime(d, '%Y-%m-%d %H:%M:%S') for d in dates[pas:]]
        ax.plot(x_axis, series_by_pas[idx], label=label_fmt.format(pas=pas))
    plt.grid()
    plt.legend()
    plt.show()

# The four figures below were four copies of the same loop; they now share
# the helper above (titles and labels unchanged).
_plot_error_raw("MAE error", result_error_MAE, ' MAE error for {pas} sample(s)')
_plot_error_raw("MSE error", result_error_MSE, ' MSE error for {pas} sample(s)')
_plot_error_raw("MAE error with modified signal", result_error_MAE_modified,
                ' MAE error with modified signal error for {pas} sample(s)')
_plot_error_raw("MSE error with modified signal", result_error_MSE_modified,
                ' MSE error with modified signal for {pas} sample(s)')
# Focus on pas = list_pas[1]: raw-signal error, smoothed-signal error and the
# PMF reference error on the same axes — MAE first, then MSE.
plt.rcParams["figure.figsize"] = (20, 10)
fig, ax = plt.subplots()
fig.suptitle("MAE error")
i = 1
pas = list_pas[i]
x_win = [datetime.strptime(d, '%Y-%m-%d %H:%M:%S') for d in dates[pas:]]
x_all = [datetime.strptime(d, '%Y-%m-%d %H:%M:%S') for d in dates]
ax.plot(x_win, result_error_MAE[i], label=f' MAE error for {pas} sample(s)')
ax.plot(x_win, result_error_MAE_modified[i], label=f' MAE error with modified for {pas} sample(s)')
ax.plot(x_all, error_PMF_MAE, label="MAE error for PMF")
plt.grid()
plt.legend()
plt.show()
plt.rcParams["figure.figsize"] = (20, 10)
fig, ax = plt.subplots()
fig.suptitle("MSE error")
i = 1
pas = list_pas[i]
x_win = [datetime.strptime(d, '%Y-%m-%d %H:%M:%S') for d in dates[pas:]]
x_all = [datetime.strptime(d, '%Y-%m-%d %H:%M:%S') for d in dates]
ax.plot(x_win, result_error_MSE[i], label=f'MSE error for {pas} sample(s)')
ax.plot(x_win, result_error_MSE_modified[i], label=f'MSE error with modified for {pas} sample(s)')
ax.plot(x_all, error_PMF_MSE, label="MSE error for PMF")
plt.grid()
plt.legend()
plt.show()
result_error_MAE[0]
[0.00917675225, 0.010149028278652265, 0.009088465316054428, 0.011385304053548428, 0.008309936944444445, 0.008101102583333334, 0.00787027236111111, 0.008820957944444444, 0.006551375569444444, 0.007850488874422194, 0.006202139194444445, 0.00801752988888889, 0.012217288777777776, 0.01130637689993861, 0.010906938490511078, 0.013298377267068805, 0.016004435463168532, 0.013658603606342612, 0.013124344065662315, 0.011405972302414295, 0.012504306591956092, 0.009168762669950207, 0.007843013458333332, 0.008754294888888887, 0.008233699875, 0.009875210516833285, 0.008329089625000001, 0.007958516, 0.005862781527777777, 0.007513051902777778, 0.007701971958333333, 0.007022214374999999, 0.008075176222222221, 0.006548809541666667, 0.007111030708333333, 0.0104128922065393, 0.007630821083333333, 0.008389002680555554, 0.009173112774542561, 0.010271431189631133, 0.012282294277777779, 0.011943332232688869, 0.011496787765467793, 0.013369258414765299, 0.01888633962710265, 0.018980407144527468, 0.018672382427019133, 0.015825020537175646, 0.013106526463671357, 0.014289330900994253, 0.01776646781890591, 0.017000282369586374, 0.020550371218815575, 0.02122255185005279, 0.019719566367129595, 0.015536355911723457, 0.015033113079631106, 0.012114757898184739, 0.012886547538487326, 0.010450772774125433, 0.00991741605957864, 0.01103472588724476, 0.0103147851654161, 0.009311959813054133, 0.012684828657823385, 0.015651500566779806, 0.016826657155775502, 0.013961407294045528, 0.008967578631329238, 0.01022504267585736, 0.010685625118582573, 0.014057277445311853, 0.012484742050167011, 0.014098400721063244, 0.013826875177975657, 0.010956017457791855, 0.011681862103980787, 0.012305492238376906, 0.012446254609493853, 0.014568108495860543, 0.01471278743875359, 0.01074818262708355, 0.011480925731793639, 0.014131972165586728, 0.011223597658613374, 0.009817701185892097, 0.009804848548146098, 0.00862222051388889, 0.010405436159172955, 0.012691529142005092, 0.016658527499764028, 0.018512955943449507, 
0.022919873860324916, 0.021047373756386212, 0.023254018196059188, 0.021139525579270103, 0.02003305448929379, 0.018595377108284437, 0.021829106140828645, 0.022665790976760723, 0.020312464135149255, 0.01931270092275868, 0.02267861457258999, 0.025854636318688826, 0.023323205092881702, 0.02253377703609701, 0.015148968691366789, 0.01403677549221343, 0.013694888311163436, 0.011987564835332988, 0.013298664185496058, 0.011172752535329301, 0.012437210781683864, 0.019678254106958563, 0.02571574293399103, 0.02671081544606187, 0.02620265539861996, 0.028854184085540118, 0.03005282193929618, 0.031125272072987195, 0.02808359776927129, 0.031089888258741025, 0.03315006796380845, 0.025335151484940643, 0.029375154159251397, 0.031059011183435556, 0.026766261422467772, 0.02674430248549625, 0.024793594850047524, 0.023209733882333203, 0.020743163296932366, 0.020567681476068588, 0.01803233999007131, 0.016948940106183086, 0.021757893971489094, 0.018293418473393615, 0.020286372217160738, 0.02225600980399206, 0.02280937405303826, 0.02571479201250038, 0.0198566863328915, 0.03103882461142348, 0.02120094915240514, 0.022289394975284437, 0.02211947904816811, 0.020548833198827346, 0.02358418003513823, 0.024787435760050092, 0.02512340010238277, 0.024668159302975355, 0.023297085001484943, 0.025028877925539855, 0.028154039791565586, 0.02332204817112851, 0.02410142160924287, 0.02404047283859881, 0.021669306384047207, 0.021463765984569606, 0.02529137005985125, 0.024485352121739842, 0.028258800869260376, 0.03469032194836473, 0.03180960109275025, 0.028662993685846412, 0.0315225666575241, 0.02766051355098132, 0.03194524394976024, 0.028838512178113458, 0.03273259256024392, 0.0273226030427339, 0.03112579565229837, 0.0321747788597055, 0.030830933573470023, 0.028592887465417365, 0.029643993268986528, 0.028811661346906024, 0.02796024131039849, 0.02587350265728538, 0.02325320352128385, 0.023592950909521796, 0.02372693532302733, 0.026522582133681687, 0.021993167932206827, 0.016511029380405107, 
0.02135367454970534, 0.02497667819317913, 0.025528822951271075, 0.027918281031863272, 0.028313317184835707, 0.023988715957148885, 0.01877780768028288, 0.014123326087478731, 0.010720414391589161, 0.010632760519344888, 0.009737662382573028, 0.009436944210882366, 0.00913395445807538, 0.01193756242210954, 0.009482363848517698, 0.011955503546803881, 0.01465239623231035, 0.013861689774185301, 0.013282848901504152, 0.0200397211357307, 0.017090860531509638, 0.014465112730765308, 0.014444197366253908, 0.014123502768403554, 0.020793652737096727, 0.025284682526484494, 0.024146963631276157, 0.023826126581062253, 0.02652750786277657, 0.028360437693774068, 0.02653886031902234, 0.026097762637575503, 0.025613142069529558, 0.025694318865789236, 0.02683250248782248, 0.026786684176916764, 0.027342095782756846, 0.031100744844429398, 0.028810294260281255, 0.031605864332673, 0.025617884312202358, 0.029347702455179717, 0.03249361824265118, 0.027224273789574576, 0.025082976831768397, 0.023341261994036792, 0.02250078690470935, 0.01958455603196216, 0.021693763793878194, 0.025887748353244207, 0.024915973263644135, 0.027656450738967872, 0.0267524439730632, 0.02765887913974885, 0.025288364062967313, 0.023460776006765008, 0.023915960359877357, 0.024263929706261084, 0.016889308950039813, 0.017403729015459987, 0.020852592973777992, 0.022652366667109076, 0.02319189076968614, 0.02011901960791321, 0.01915189656759683, 0.023225875423460433, 0.018425678028781672, 0.013440644322855183, 0.021081567613540964, 0.016141162403581617, 0.013304878739070379, 0.014049480703738262, 0.017437508259594385, 0.018732308088326113, 0.023904283914498378, 0.026767112161967834, 0.02529859773370502, 0.02571082003784476, 0.025232465311827004, 0.02074419706719716, 0.021692668820577337, 0.019973756290708466, 0.01885536105508771, 0.01906564383360522, 0.02005724375184272, 0.02445663861970126, 0.02324160318556373, 0.02192366923685546, 0.021248996161713305, 0.021081556429922196, 0.01795271130854589, 0.012380871820760538, 
0.013642374315626458, 0.014647904753693585, 0.012594161356492828, 0.013051980904950109, 0.0135025403985607, 0.014978522338922353, 0.02280151788409626, 0.021460727230850678, 0.022447087633990854, 0.018612439325878846, 0.020188018419275554, 0.016975637316904244, 0.016654279846963667, 0.0156983838170537, 0.014583258889713865, 0.015671319075229312, 0.01349193836415812, 0.01418830793019368, 0.022271777050195268, 0.024556824343228725, 0.022962737597954738, 0.01973265197085287, 0.01823847560445247, 0.020540172725858803, 0.01841853486345462, 0.016844057195852597, 0.014693054497395627, 0.016230850068483415, 0.01659265267944789, 0.020577006857346498, 0.023630212884893042, 0.027986998864057713, 0.026453205384060787, 0.028938455714329472, 0.031047933627550817, 0.026945573743066, 0.02970094300094734, 0.027409905636792537, 0.028997633345257273, 0.03008710358148577, 0.025769143162033135, 0.020491026349918232, 0.020144785095216708, 0.019963869463433277, 0.02324672213900963, 0.019413610484352167, 0.02198141405063228, 0.018694940187398146, 0.019537081562138757, 0.018759798030540897, 0.017899154346136908, 0.016543337046186067, 0.016107746594595674, 0.021000714732136407, 0.024508809670957376, 0.02431469107202189, 0.025325010713440052, 0.02683521598368745, 0.028274729485001385, 0.02697482298544493]
Observations faites sur les différents niveaux de lissage :
Principe de la méthode LASSO
La pondération du lissage dans la méthode LASSO se fait en pondérant les échantillons du passé dans la fonction de coût (plus l'échantillon est loin dans le passé, plus l'impact est faible). Le système d'équations est quant à lui identique.
Dans un premier temps, une pondération linéaire est mise en place. Pour n échantillons pris en compte :
# Linearly-weighted smoothing: inside a window of `pas` samples the weight
# grows linearly from 1/pas (oldest) to 1 (most recent); the weighted sum is
# then renormalised by the total weight.
# FIX: the loop temporaries no longer reuse the global name `result`, which
# holds the first experiment's contributions and is still needed by the final
# comparison plots (the original silently clobbered it).
list_pas_2 = [1, 3, 5, 7, 9]
lissage_brute_2 = []
for pas in list_pas_2:
    smoothed = []
    for start, _ in enumerate(dates[pas:]):
        weighted = []
        total_coef = 0
        for j in range(pas):
            coef = (j + 1) / pas
            total_coef += coef
            weighted.append(coef * df_array[start + j])
        smoothed.append(np.sum(weighted, axis=0) / total_coef)
    lissage_brute_2.append(smoothed)
# --- Lasso fit with LINEAR weighting of past samples -------------------------
# Same protocol as the raw fit, but each equation (row of the system) is
# multiplied on both sides by a weight that decreases linearly with the age of
# the sample, so recent samples dominate the cost function.
result_2 = []
result_2_error_MAE = []
result_2_error_MSE = []
result_2_error_MAE_modified = []
result_2_error_MSE_modified = []
connection = psycopg2.connect(user=USER, password=PASSWORD, host=HOSTNAME,
                              port=PORT, database=DATABASE)
try:
    for index_pas, pas in enumerate(list_pas_2):
        contribution = [[] for _ in pro]
        contribution_mae = []
        contribution_mse = []
        contribution_mae_modified = []
        contribution_mse_modified = []
        for index_date, date in enumerate(dates[pas:]):
            # Receptor data for the current date and the pas - 1 previous hours.
            sql = f"""SELECT * FROM public.data_receptor where date = '{date}' """
            for k in range(1, pas):
                date_intermediaire = (datetime.strptime(date, '%Y-%m-%d %H:%M:%S')
                                      - timedelta(hours=k)).strftime('%Y-%m-%d %H:%M:%S')
                sql += f"or date = '{date_intermediaire}' "
            sql += "order by mass;"
            df_receptor_data = sqlio.read_sql_query(sql, connection)
            # (a no-op self-reindexing line was removed here)
            df_receptor_data_ref = df_receptor_data[df_receptor_data["date"] == date]["value"].values
            # Linear weights: the sample k hours in the past receives
            # (pas - k) / (pas * sum_total); the most recent sample weighs most.
            sum_total = np.sum([j / pas for j in range(1, pas + 1)])
            for k in range(pas):
                date_target = (datetime.strptime(date, '%Y-%m-%d %H:%M:%S')
                               - timedelta(hours=k)).strftime('%Y-%m-%d %H:%M:%S')
                df_receptor_data.loc[df_receptor_data["date"] == date_target, "coef"] = (pas - k) * 1 / (pas * sum_total)
            cor = df.merge(df_receptor_data, left_on='amus', right_on='mass').drop(columns=['mass', 'amus'])
            # Weight every equation on both sides at once; this replaces the
            # per-row np.dot(row, scalar) loops with the same arithmetic.
            weights = cor['coef'].values
            X_train = cor[pro].values * weights[:, None]
            y_train = (cor['value'].values * weights).reshape(-1, 1)
            # We train without intercept and choose to keep only positive values.
            alpha = 0.001
            lasso = Lasso(fit_intercept=False, alpha=alpha, positive=True)
            lasso.fit(X_train, y_train)
            clear_output(wait=True)
            print(pas, ' ', date)
            for n, prof in enumerate(pro):
                contribution[n].append(lasso.coef_[n])
            # Signal reconstructed from the fitted contributions.
            data_profile = df[pro].values
            construct = np.dot(data_profile, lasso.coef_)
            # Errors against the raw reference signal.
            error_mae = np.sum(np.abs(construct - df_receptor_data_ref)) / len(df_receptor_data_ref)
            contribution_mae.append(error_mae)
            error_mse = np.sum(np.abs(construct**2 - df_receptor_data_ref**2)) / len(df_receptor_data_ref)
            contribution_mse.append(error_mse)
            # Errors against the linearly smoothed signal.
            error_mae = np.sum(np.abs(construct - lissage_brute_2[index_pas][index_date])) / len(construct)
            contribution_mae_modified.append(error_mae)
            error_mse = np.sum(np.abs(construct**2 - lissage_brute_2[index_pas][index_date]**2)) / len(construct)
            contribution_mse_modified.append(error_mse)
        result_2.append(contribution)
        result_2_error_MSE.append(contribution_mse)
        result_2_error_MAE.append(contribution_mae)
        result_2_error_MSE_modified.append(contribution_mse_modified)
        result_2_error_MAE_modified.append(contribution_mae_modified)
finally:
    connection.close()
print("Done")
9 2015-03-15 00:00:00 Done
# Linearly-weighted contributions per profile, with the PMF reference.
plt.rcParams["figure.figsize"] = (20, 10)
for n, prof in enumerate(pro):
    fig, ax = plt.subplots()
    fig.suptitle(prof)
    for idx, pas in enumerate(list_pas_2):
        x_axis = [datetime.strptime(d, '%Y-%m-%d %H:%M:%S') for d in dates[pas:]]
        ax.plot(x_axis, result_2[idx][n], label=f'{pas} sample(s)')
    pmf = df_regressor[df_regressor["profile"] == prof].sort_values(by="date")["contribution"].values
    x_pmf = [datetime.strptime(d, '%Y-%m-%d %H:%M:%S') for d in dates[1:]]
    ax.plot(x_pmf, pmf, label='PMF')
    plt.grid()
    plt.legend()
    plt.show()
# Signed gap (not absolute) between the linearly-weighted contributions and
# the PMF reference, per profile and window size.
plt.rcParams["figure.figsize"] = (20, 10)
for n, prof in enumerate(pro):
    fig, ax = plt.subplots()
    fig.suptitle("Lasso VS PMF : " + prof)
    pmf = df_regressor[df_regressor["profile"] == prof].sort_values(by="date")["contribution"].values
    for idx, pas in enumerate(list_pas_2):
        x_axis = [datetime.strptime(d, '%Y-%m-%d %H:%M:%S') for d in dates[pas:]]
        ax.plot(x_axis, result_2[idx][n] - pmf[pas - 1:], label=f'{pas} sample(s)')
    plt.grid()
    plt.legend()
    plt.show()
# Per-profile grid of linearly-weighted contribution curves, one subplot per
# window size. `squeeze=False` keeps `axes` 2-D for any grid shape, removing
# the single-row branch and the crash the old `ax[col]` indexing had when the
# grid degenerates to one subplot.
plt.rcParams["figure.figsize"] = (20, 5)
for n, prof in enumerate(pro):
    n_rows = int(np.ceil(len(list_pas_2) / 3))
    n_cols = min(3, len(list_pas_2))
    fig, axes = plt.subplots(n_rows, n_cols, squeeze=False)
    fig.suptitle(prof)
    for i, pas in enumerate(list_pas_2):
        ax = axes[i // 3][i % 3]
        ax.plot(result_2[i][n], label=f'{pas} sample(s)')
        ax.grid()
        ax.legend()
    plt.show()
def _plot_error_2(title, series_by_pas, label_fmt):
    """One figure for the linearly-weighted fit errors: one curve per window
    size of `series_by_pas` against that window's date axis."""
    plt.rcParams["figure.figsize"] = (20, 10)
    fig, ax = plt.subplots()
    fig.suptitle(title)
    for idx, pas in enumerate(list_pas_2):
        x_axis = [datetime.strptime(d, '%Y-%m-%d %H:%M:%S') for d in dates[pas:]]
        ax.plot(x_axis, series_by_pas[idx], label=label_fmt.format(pas=pas))
    plt.grid()
    plt.legend()
    plt.show()

# These four figures were four copies of the same loop; they also iterated
# over `list_pas` instead of this section's `list_pas_2` (same values, but
# now consistent).
_plot_error_2("MAE error", result_2_error_MAE, ' MAE error for {pas} sample(s)')
_plot_error_2("MSE error", result_2_error_MSE, ' MSE error for {pas} sample(s)')
_plot_error_2("MAE error with modified", result_2_error_MAE_modified,
              ' MAE error with modified for {pas} sample(s)')
_plot_error_2("MSE error with modified", result_2_error_MSE_modified,
              ' MSE error with modified for {pas} sample(s)')

# Focused comparison for pas = list_pas_2[1] against the PMF reference.
plt.rcParams["figure.figsize"] = (20, 10)
fig, ax = plt.subplots()
fig.suptitle("MAE error linear ponderation")
i = 1
pas = list_pas_2[i]
x_win = [datetime.strptime(d, '%Y-%m-%d %H:%M:%S') for d in dates[pas:]]
x_all = [datetime.strptime(d, '%Y-%m-%d %H:%M:%S') for d in dates]
ax.plot(x_win, result_2_error_MAE[i], label=f' MAE error for {pas} sample(s)')
ax.plot(x_win, result_2_error_MAE_modified[i], label=f' MAE error with modified for {pas} sample(s)')
ax.plot(x_all, error_PMF_MAE, label="MAE error for PMF")
plt.grid()
plt.legend()
plt.show()

plt.rcParams["figure.figsize"] = (20, 10)
fig, ax = plt.subplots()
fig.suptitle("MSE error linear ponderation")
i = 1
pas = list_pas_2[i]
x_win = [datetime.strptime(d, '%Y-%m-%d %H:%M:%S') for d in dates[pas:]]
x_all = [datetime.strptime(d, '%Y-%m-%d %H:%M:%S') for d in dates]
ax.plot(x_win, result_2_error_MSE[i], label=f'MSE error for {pas} sample(s)')
# BUGFIX: this curve previously plotted result_2_error_MAE_modified under an
# MSE label; it now plots the MSE-with-modified series, like the other sections.
ax.plot(x_win, result_2_error_MSE_modified[i], label=f'MSE error with modified for {pas} sample(s)')
ax.plot(x_all, error_PMF_MSE, label="MSE error for PMF")
plt.grid()
plt.legend()
plt.show()
Observations faites sur les différents niveaux de lissage avec pondération :
Dans cette partie, une pondération exponentielle est mise en place. Pour n échantillons pris en compte :
# Exponentially-weighted smoothing: inside a window the sample k steps in the
# past receives weight taux**k relative to the most recent one (coefficients
# are built as taux**(pas-1-j) then renormalised by their sum).
# FIX: the loop temporaries no longer reuse the global name `result`, which
# holds the first experiment's contributions and is still needed by the final
# comparison plots (the original silently clobbered it).
list_pas_3 = [1, 3, 5, 7, 9]
taux = 0.15
lissage_brute_3 = []
for pas in list_pas_3:
    smoothed = []
    for start, _ in enumerate(dates[pas:]):
        weighted = []
        total_coef = 0
        for j in range(pas):
            coef = taux ** (pas - 1 - j)
            total_coef += coef
            weighted.append(coef * df_array[start + j])
        smoothed.append(np.sum(weighted, axis=0) / total_coef)
    lissage_brute_3.append(smoothed)
# --- Lasso fit with EXPONENTIAL weighting of past samples --------------------
# Same protocol as the linear version, but the weight of the sample k hours in
# the past is taux**k (most recent sample has weight 1).
result_3 = []
result_3_error_MAE = []
result_3_error_MSE = []
result_3_error_MAE_modified = []
result_3_error_MSE_modified = []
connection = psycopg2.connect(user=USER, password=PASSWORD, host=HOSTNAME,
                              port=PORT, database=DATABASE)
try:
    for index_pas, pas in enumerate(list_pas_3):
        contribution = [[] for _ in pro]
        contribution_mae = []
        contribution_mse = []
        contribution_mae_modified = []
        contribution_mse_modified = []
        for index_date, date in enumerate(dates[pas:]):
            # Receptor data for the current date and the pas - 1 previous hours.
            sql = f"""SELECT * FROM public.data_receptor where date = '{date}' """
            for k in range(1, pas):
                date_intermediaire = (datetime.strptime(date, '%Y-%m-%d %H:%M:%S')
                                      - timedelta(hours=k)).strftime('%Y-%m-%d %H:%M:%S')
                sql += f"or date = '{date_intermediaire}' "
            sql += "order by mass;"
            df_receptor_data = sqlio.read_sql_query(sql, connection)
            # (a no-op self-reindexing line was removed here)
            df_receptor_data_ref = df_receptor_data[df_receptor_data["date"] == date]["value"].values
            # Exponential weights: sample k hours in the past gets taux**k.
            for k in range(pas):
                date_target = (datetime.strptime(date, '%Y-%m-%d %H:%M:%S')
                               - timedelta(hours=k)).strftime('%Y-%m-%d %H:%M:%S')
                df_receptor_data.loc[df_receptor_data["date"] == date_target, "coef"] = taux**k
            cor = df.merge(df_receptor_data, left_on='amus', right_on='mass').drop(columns=['mass', 'amus'])
            # Weight every equation on both sides at once; this replaces the
            # per-row np.dot(row, scalar) loops with the same arithmetic.
            weights = cor['coef'].values
            X_train = cor[pro].values * weights[:, None]
            y_train = (cor['value'].values * weights).reshape(-1, 1)
            # We train without intercept and choose to keep only positive values.
            alpha = 0.001
            lasso = Lasso(fit_intercept=False, alpha=alpha, positive=True)
            lasso.fit(X_train, y_train)
            clear_output(wait=True)
            print(pas, ' ', date)
            for n, prof in enumerate(pro):
                contribution[n].append(lasso.coef_[n])
            # Signal reconstructed from the fitted contributions.
            data_profile = df[pro].values
            construct = np.dot(data_profile, lasso.coef_)
            # Errors against the raw reference signal.
            error_mae = np.sum(np.abs(construct - df_receptor_data_ref)) / len(df_receptor_data_ref)
            contribution_mae.append(error_mae)
            error_mse = np.sum(np.abs(construct**2 - df_receptor_data_ref**2)) / len(df_receptor_data_ref)
            contribution_mse.append(error_mse)
            # Errors against the exponentially smoothed signal (the original
            # comments mislabeled these two as plain MAE/MSE).
            error_mae = np.sum(np.abs(construct - lissage_brute_3[index_pas][index_date])) / len(construct)
            contribution_mae_modified.append(error_mae)
            error_mse = np.sum(np.abs(construct**2 - lissage_brute_3[index_pas][index_date]**2)) / len(construct)
            contribution_mse_modified.append(error_mse)
        result_3.append(contribution)
        result_3_error_MSE.append(contribution_mse)
        result_3_error_MAE.append(contribution_mae)
        result_3_error_MSE_modified.append(contribution_mse_modified)
        result_3_error_MAE_modified.append(contribution_mae_modified)
finally:
    connection.close()
print("Done")
9 2015-03-15 00:00:00 Done
# Exponentially-weighted contributions per profile and window size.
plt.rcParams["figure.figsize"] = (20, 10)
for n, prof in enumerate(pro):
    fig, ax = plt.subplots()
    fig.suptitle(prof)
    for idx, pas in enumerate(list_pas_3):
        x_axis = [datetime.strptime(d, '%Y-%m-%d %H:%M:%S') for d in dates[pas:]]
        ax.plot(x_axis, result_3[idx][n], label=f'{pas} sample(s)')
    plt.grid()
    plt.legend()
    plt.show()
# Per-profile grid of exponentially-weighted contribution curves, one subplot
# per window size. `squeeze=False` keeps `axes` 2-D for any grid shape,
# removing the single-row branch and the crash the old `ax[col]` indexing had
# when the grid degenerates to one subplot.
plt.rcParams["figure.figsize"] = (20, 5)
for n, prof in enumerate(pro):
    n_rows = int(np.ceil(len(list_pas_3) / 3))
    n_cols = min(3, len(list_pas_3))
    fig, axes = plt.subplots(n_rows, n_cols, squeeze=False)
    fig.suptitle(prof)
    for i, pas in enumerate(list_pas_3):
        ax = axes[i // 3][i % 3]
        ax.plot(result_3[i][n], label=f'{pas} sample(s)')
        ax.grid()
        ax.legend()
    plt.show()
plt.rcParams["figure.figsize"] = (20,10)
fig, ax = plt.subplots()
fig.suptitle("MAE error")
for i, pas in enumerate(list_pas):
ax.plot([datetime.strptime(i, '%Y-%m-%d %H:%M:%S') for i in dates[pas:]], result_3_error_MAE[i], label=f' MAE error for {pas} sample(s)')
plt.grid()
plt.legend()
plt.show()
plt.rcParams["figure.figsize"] = (20,10)
fig, ax = plt.subplots()
fig.suptitle("MSE error")
for i, pas in enumerate(list_pas):
ax.plot([datetime.strptime(i, '%Y-%m-%d %H:%M:%S') for i in dates[pas:]], result_3_error_MSE[i], label=f' MSE error for {pas} sample(s)')
plt.grid()
plt.legend()
plt.show()
# Modified (exponentially weighted) MAE error over time, one curve per
# smoothing window length.
plt.rcParams["figure.figsize"] = (20, 10)
fig, ax = plt.subplots()
fig.suptitle("MAE error with exponential weights")
# Parse the date strings once instead of once per window length.
parsed_dates = [datetime.strptime(d, '%Y-%m-%d %H:%M:%S') for d in dates]
for i, pas in enumerate(list_pas):
    ax.plot(parsed_dates[pas:], result_3_error_MAE_modified[i], label=f' MAE error with modified for {pas} sample(s)')
plt.grid()
plt.legend()
plt.show()
# Modified (exponentially weighted) MSE error over time, one curve per
# smoothing window length.
plt.rcParams["figure.figsize"] = (20, 10)
fig, ax = plt.subplots()
fig.suptitle("MSE error with exponential weights")
# Parse the date strings once instead of once per window length.
parsed_dates = [datetime.strptime(d, '%Y-%m-%d %H:%M:%S') for d in dates]
for i, pas in enumerate(list_pas):
    ax.plot(parsed_dates[pas:], result_3_error_MSE_modified[i], label=f' MSE error with modified for {pas} sample(s)')
plt.grid()
plt.legend()
plt.show()
# Compare the plain and modified MAE for a single window length against
# the PMF reference error.
plt.rcParams["figure.figsize"] = (20, 10)
fig, ax = plt.subplots()
fig.suptitle("MAE error")
# Renamed the selector from `i` to `idx`: the original reused `i` both as
# the selected index and as the comprehension variable, and it re-parsed
# the date list three times; parse once and slice.
idx = 1
pas = list_pas[idx]
parsed_dates = [datetime.strptime(d, '%Y-%m-%d %H:%M:%S') for d in dates]
ax.plot(parsed_dates[pas:], result_3_error_MAE[idx], label=f' MAE error for {pas} sample(s)')
ax.plot(parsed_dates[pas:], result_3_error_MAE_modified[idx], label=f' MAE error with modified for {pas} sample(s)')
# The PMF reference covers the full period, so no slice here.
ax.plot(parsed_dates, error_PMF_MAE, label="MAE error for PMF")
plt.grid()
plt.legend()
plt.show()
# Compare the plain and modified MSE for a single window length against
# the PMF reference error.
plt.rcParams["figure.figsize"] = (20, 10)
fig, ax = plt.subplots()
fig.suptitle("MSE error")
# Renamed the selector from `i` to `idx` (the original reused `i` as the
# comprehension variable too) and parse the date strings only once.
idx = 1
pas = list_pas[idx]
parsed_dates = [datetime.strptime(d, '%Y-%m-%d %H:%M:%S') for d in dates]
ax.plot(parsed_dates[pas:], result_3_error_MSE[idx], label=f'MSE error for {pas} sample(s)')
ax.plot(parsed_dates[pas:], result_3_error_MSE_modified[idx], label=f'MSE error with modified for {pas} sample(s)')
# The PMF reference covers the full period, so no slice here.
ax.plot(parsed_dates, error_PMF_MSE, label="MSE error for PMF")
plt.grid()
plt.legend()
plt.show()
Observations faites sur les différents niveaux de lissage avec pondération exponentielle :
# Per-profile comparison of the unweighted, linearly weighted and
# exponentially weighted smoothing, one panel per window length in
# list_pas_2, three panels per row.
plt.rcParams["figure.figsize"] = (20, 5)
n_rows = int(np.ceil(len(list_pas_2) / 3))
n_cols = min(3, len(list_pas_2))
for n, prof in enumerate(pro):
    # squeeze=False always yields a 2-D axes array: this collapses the
    # duplicated single-row / multi-row branches and fixes the crash the
    # original would hit with exactly one panel (bare Axes, not
    # subscriptable).
    fig, axes = plt.subplots(n_rows, n_cols, squeeze=False)
    fig.suptitle(prof)
    for i, pas in enumerate(list_pas_2):
        panel = axes[i // 3][i % 3]
        panel.plot(result[i][n], label=f'{pas} sample(s)')
        panel.plot(result_2[i][n], label=f'{pas} sample(s) with linear ponderation')
        panel.plot(result_3[i][n], label=f'{pas} sample(s) with exponential ponderation')
        panel.grid()
        panel.legend()
    plt.show()
# Summary table: mean MAE per window length for the three weighting
# schemes, with the PMF reference appended as a final row (repeated across
# the three columns so the row is comparable to the others).
data = [
    [pas,
     np.mean(result_error_MAE[k]),
     np.mean(result_2_error_MAE[k]),
     np.mean(result_3_error_MAE[k])]
    for k, pas in enumerate(list_pas)
]
pmf_mae = np.mean(error_PMF_MAE)
data.append(['PMF', pmf_mae, pmf_mae, pmf_mae])
pd.DataFrame(data=data, columns=["Nb samples", "MAE error", "MAE error with linear ponderation", "MAE error with exponential ponderation"])
| | Nb samples | MAE error | MAE error with linear ponderation | MAE error with exponential ponderation |
|---|---|---|---|---|
| 0 | 1 | 0.019351 | 0.019351 | 0.019351 |
| 1 | 3 | 0.061331 | 0.054341 | 0.033360 |
| 2 | 5 | 0.069137 | 0.079972 | 0.042809 |
| 3 | 7 | 0.072975 | 0.081464 | 0.051178 |
| 4 | 9 | 0.075615 | 0.081910 | 0.058951 |
| 5 | PMF | 0.010537 | 0.010537 | 0.010537 |
# Summary table: mean modified MAE per window length for the three
# weighting schemes, with the PMF reference appended as a final row.
data = [
    [pas,
     np.mean(result_error_MAE_modified[k]),
     np.mean(result_2_error_MAE_modified[k]),
     np.mean(result_3_error_MAE_modified[k])]
    for k, pas in enumerate(list_pas)
]
pmf_mae = np.mean(error_PMF_MAE)
data.append(['PMF', pmf_mae, pmf_mae, pmf_mae])
pd.DataFrame(data=data, columns=["Nb samples", "MAE error with modified", "MAE error with modified (with linear ponderation)", "MAE error with modified (with exponential ponderation)"])
| | Nb samples | MAE error with modified | MAE error with modified (with linear ponderation) | MAE error with modified (with exponential ponderation) |
|---|---|---|---|---|
| 0 | 1 | 0.019351 | 0.019351 | 0.019351 |
| 1 | 3 | 0.060304 | 0.053542 | 0.033043 |
| 2 | 5 | 0.067448 | 0.078725 | 0.042510 |
| 3 | 7 | 0.070730 | 0.079802 | 0.050905 |
| 4 | 9 | 0.072900 | 0.079884 | 0.058692 |
| 5 | PMF | 0.010537 | 0.010537 | 0.010537 |
# Summary table: mean MSE per window length for the three weighting
# schemes, with the PMF reference appended as a final row.
data = [
    [pas,
     np.mean(result_error_MSE[k]),
     np.mean(result_2_error_MSE[k]),
     np.mean(result_3_error_MSE[k])]
    for k, pas in enumerate(list_pas)
]
pmf_mse = np.mean(error_PMF_MSE)
data.append(['PMF', pmf_mse, pmf_mse, pmf_mse])
pd.DataFrame(data=data, columns=["Nb samples", "MSE error", "MSE error with linear ponderation", "MSE error with exponential ponderation"])
| | Nb samples | MSE error | MSE error with linear ponderation | MSE error with exponential ponderation |
|---|---|---|---|---|
| 0 | 1 | 0.010858 | 0.010858 | 0.010858 |
| 1 | 3 | 0.064113 | 0.051524 | 0.025823 |
| 2 | 5 | 0.068671 | 0.070885 | 0.038035 |
| 3 | 7 | 0.070339 | 0.071624 | 0.047953 |
| 4 | 9 | 0.071333 | 0.072057 | 0.055761 |
| 5 | PMF | 0.010755 | 0.010755 | 0.010755 |
# Summary table: mean modified MSE per window length for the three
# weighting schemes, with the PMF reference appended as a final row.
data = [
    [pas,
     np.mean(result_error_MSE_modified[k]),
     np.mean(result_2_error_MSE_modified[k]),
     np.mean(result_3_error_MSE_modified[k])]
    for k, pas in enumerate(list_pas)
]
pmf_mse = np.mean(error_PMF_MSE)
data.append(['PMF', pmf_mse, pmf_mse, pmf_mse])
pd.DataFrame(data=data, columns=["Nb samples", "MSE error with modified", "MSE error with modified (with linear ponderation)", "MSE error with modified (with exponential ponderation)"])
| | Nb samples | MSE error with modified | MSE error with modified (with linear ponderation) | MSE error with modified (with exponential ponderation) |
|---|---|---|---|---|
| 0 | 1 | 0.010858 | 0.010858 | 0.010858 |
| 1 | 3 | 0.062149 | 0.050017 | 0.025336 |
| 2 | 5 | 0.064905 | 0.068053 | 0.037519 |
| 3 | 7 | 0.064935 | 0.067525 | 0.047439 |
| 4 | 9 | 0.064518 | 0.066798 | 0.055249 |
| 5 | PMF | 0.010755 | 0.010755 | 0.010755 |
Pour résumer :